* Root of the project tree; every other path in this script is built from it.
global projectdir "~"
* Data directory under the project root. Raw pulls are read from, and the BR
* output is written to, subfolders of this directory.
global datadir "$projectdir/data"

* Work from the raw-pull folder so the w2_ssl_* files can be opened by bare file name.
cd $datadir/raw_pulls/w2_ssl_usiris/

* One pass per year, 2002 through 2016.
forvalues year = 2002/2016 {
display `year'
use w2_ssl_su_`year'_usiris_2018q4, clear

* Layout for 2001 and earlier (not reached with the current year range).
if `year' <= 2001 {
    keep ein lfo recnum name1 name2
}

* Flag SU EINs that appear in the LBD firm file for the same year.
* (Original note: LBD data year 2016 firms data.)
* The SU firmid is "0" + ein. estabs is pulled in only so the merge has a
* variable to carry and is dropped again straight away.
generate firmid = "0" + ein
merge m:1 firmid using $datadir/raw_pulls/lbd_firms/firm_`year'_emp_c201600.dta, keepusing(estabs) update
generate in_lbd_firm = 1 if _merge > 1
drop if _merge == 2
drop _merge firmid estabs

*Drop duplicates by ein
* Each pass tags how many OTHER rows share the EIN and then drops the
* less-preferred copies. Passes come in pairs: the first spares rows flagged
* in_lbd_firm, the second applies the same rule to every row.
* NOTE(review): several rules (e.g. the remp==. rule) do not check that another
* copy of the EIN survives, so an EIN whose copies all meet the rule is removed
* entirely -- confirm this is intended.
if `year'>2001{
* dup_ein      = other rows with the same ein
* dup_ein_actv = other rows with the same ein AND the same einunit_id_actv_stat
duplicates tag ein, gen(dup_ein)
duplicates tag ein einunit_id_actv_stat, gen(dup_ein_actv)
* Both drops below use the tags computed above; they are not refreshed in between.
drop if dup_ein>0 & dup_ein_actv==0 & einunit_id_actv_stat=="N" & in_lbd_firm!=1
drop if dup_ein>1 & dup_ein_actv>0 & einunit_id_actv_stat=="N" & in_lbd_firm!=1
drop dup_ein dup_ein_actv
* Same two rules again, now regardless of in_lbd_firm.
duplicates tag ein, gen(dup_ein)
duplicates tag ein einunit_id_actv_stat, gen(dup_ein_actv)
drop if dup_ein>0 & dup_ein_actv==0 & einunit_id_actv_stat=="N"
drop if dup_ein>1 & dup_ein_actv>0 & einunit_id_actv_stat=="N"

* Among remaining duplicates, drop rows with remp equal to system missing.
duplicates tag ein, gen(dup_ein_1)
drop if dup_ein_1>0 & remp==. & in_lbd_firm!=1
drop dup_ein_1
duplicates tag ein, gen(dup_ein_1)
drop if dup_ein_1>0 & remp==.

* Among remaining duplicates, drop rows whose empunit_id_actv_stat is not "Y".
duplicates tag ein, gen(dup_ein_2)
drop if dup_ein_2>0 &  empunit_id_actv_stat!="Y" & in_lbd_firm!=1
drop dup_ein_2
duplicates tag ein, gen(dup_ein_2)
drop if dup_ein_2>0 &  empunit_id_actv_stat!="Y"
* This keep also discards the leftover dup_* helper variables and in_lbd_firm.
keep ein lfo recnum name1 name2 empunit_typ
}
* Prefix every variable with su_, then give the EIN back its plain name
* (it is the key for the later merge on ein).
rename * su_*
rename su_ein ein

* Park the SU result; it is reloaded once the MU file has been processed.
tempfile su_eins
save "`su_eins'"

* Some EINs are associated with more than one firmid (alpha).
* The MU steps that follow try to clean that up.
use w2_ssl_mu_`year'_usiris_2018q4, clear

* Keep the identifiers plus the year-specific status variable.
if `year' <= 2001 {
    keep alpha ein act remp
}
else if `year' == 2002 {
    keep alpha ein act remp inscope_cbp_stat
}
else {
    keep alpha ein act remp cbp
}

* Flag alphas that appear in the LBD firm file for the same year.
* The MU firmid is alpha + "0000".
generate firmid = alpha + "0000"
merge m:1 firmid using $datadir/raw_pulls/lbd_firms/firm_`year'_emp_c201600.dta, keepusing(estabs) update
generate in_lbd_firm = 1 if _merge > 1
drop if _merge == 2
drop _merge firmid estabs

* d_remp: 1 if remp > 0, 0 otherwise, system missing where remp is system missing.
generate d_remp = cond(remp == ., ., remp > 0)
drop remp
duplicates drop

* Four passes over EINs that still have more than one row. Each pass re-tags
* the duplicates, drops the rows matching the rule, and removes the tag.
* Order matters: missing d_remp, then zero d_remp, first sparing rows flagged
* in_lbd_firm, then the same two rules for every row.
foreach rule in "d_remp==. & in_lbd_firm!=1" "d_remp==0 & in_lbd_firm!=1" "d_remp==." "d_remp==0" {
    duplicates tag ein, gen(dup_ein)
    drop if dup_ein>0 & `rule'
    drop dup_ein
}

* Year-specific rules for EINs that still map to several rows.
* Each act / cbp rule runs twice: first sparing rows flagged in_lbd_firm,
* then for every row. dup_ein is re-tagged before each rule.
duplicates tag ein, gen(dup_ein)

if `year' <= 2001 {
    * Layout for 2001 and earlier: drop act "G", then act "D".
    drop if dup_ein>0 & act=="G" & in_lbd_firm!=1
    drop dup_ein
    duplicates tag ein, gen(dup_ein)
    drop if dup_ein>0 & act=="D" & in_lbd_firm!=1
    drop dup_ein

    duplicates tag ein, gen(dup_ein)
    drop if dup_ein>0 & act=="G"
    drop dup_ein
    duplicates tag ein, gen(dup_ein)
    drop if dup_ein>0 & act=="D"
    drop dup_ein act
}
else if `year' == 2002 {
    * 2002 layout: drop inscope_cbp_stat "N", then act "N".
    drop if dup_ein>0 & inscope_cbp_stat=="N" & in_lbd_firm!=1
    drop dup_ein
    duplicates tag ein, gen(dup_ein)
    drop if dup_ein>0 & act=="N" & in_lbd_firm!=1
    drop dup_ein

    duplicates tag ein, gen(dup_ein)
    drop if dup_ein>0 & inscope_cbp_stat=="N"
    drop dup_ein
    duplicates tag ein, gen(dup_ein)
    drop if dup_ein>0 & act=="N"
    drop dup_ein
}
else {
    * 2003 onward: drop cbp 0, then act "N".
    drop if dup_ein>0 & cbp==0 & in_lbd_firm!=1
    drop dup_ein
    duplicates tag ein, gen(dup_ein)
    drop if dup_ein>0 & act=="N" & in_lbd_firm!=1
    drop dup_ein

    duplicates tag ein, gen(dup_ein)
    drop if dup_ein>0 & cbp==0
    drop dup_ein
    duplicates tag ein, gen(dup_ein)
    drop if dup_ein>0 & act=="N"
    drop dup_ein
    duplicates tag ein, gen(dup_ein)

    * Redacted individual alpha fixes for 2011 and 2013.
    drop if dup_ein>0 & alpha=="~"
    drop dup_ein
}

* Common tail for every layout: collapse exact duplicate rows, re-tag, and
* snapshot the data (still including the multi-row EINs) for the leftover
* duplicate step later on.
duplicates drop
duplicates tag ein, gen(dup_ein)

tempfile mu_dup_eins
save "`mu_dup_eins'"

* From here on keep only EINs with a single remaining row.
drop if dup_ein>0

keep alpha ein
duplicates drop

tempfile mu_eins
save "`mu_eins'"

*Deal with the leftover duplicate EINs
* Reload the snapshot taken before the multi-row EINs were removed and keep
* only those EINs (dup_ein>0 means the EIN still has more than one row).
use "`mu_dup_eins'", clear
keep if dup_ein>0
summarize

* Take the row count from _N. r(N) after a bare -summarize- is the non-missing
* count of the LAST variable only, which equals the row count only while that
* variable happens to be numeric and fully populated.
local n_var = _N
if `n_var'>0 {

    gen firmid = alpha + "0000"

    * Bring in the full set of LBD firm variables for the candidate alphas.
    merge m:1 firmid using $datadir/raw_pulls/lbd_firms/firm_`year'_emp_c201600.dta, update
    drop if _merge==2
    drop _merge

    *Just use age and size
    order alpha
    keep alpha-startdiff
    drop dup_ein
    duplicates drop

    * Stata orders missing values above every number, so in the "<" tests below
    * a candidate with no emp would count as the largest firm and knock out
    * candidates that do have LBD size data. Remove such candidates first
    * whenever the same EIN has at least one candidate with emp. Within each
    * ein the rows are sorted by emp, so emp[1] is non-missing exactly when
    * some candidate has emp.
    * NOTE(review): estabs, firmage and f_lastyear are compared the same way
    * below; if they can be missing while emp is not, they need the same care.
    bysort ein (emp): drop if missing(emp) & !missing(emp[1])

    * Each rule below is repeated a fixed number of times because one pass only
    * compares a row with the row directly after it in the sort order.

    *Larger, with more or the same number of estabs
    forvalues pass = 1/10 {
        duplicates tag ein, gen(dup_ein)
        sort ein emp estabs
        drop if dup_ein>0 & estabs<=estabs[_n+1] & emp<emp[_n+1] & ein==ein[_n+1]
        drop dup_ein
    }

    *More estabs, same or larger emp
    forvalues pass = 1/5 {
        duplicates tag ein, gen(dup_ein)
        sort ein emp estabs
        drop if dup_ein>0 & estabs<estabs[_n+1] & emp<=emp[_n+1] & ein==ein[_n+1]
        drop dup_ein
    }

    *Older
    forvalues pass = 1/5 {
        duplicates tag ein, gen(dup_ein)
        sort ein firmage
        drop if dup_ein>0 & firmage<firmage[_n+1] & ein==ein[_n+1]
        drop dup_ein
    }

    *Older or same age, later last year
    forvalues pass = 1/5 {
        duplicates tag ein, gen(dup_ein)
        sort ein firmage f_lastyear
        drop if dup_ein>0 & firmage<=firmage[_n+1] & f_lastyear<f_lastyear[_n+1] & ein==ein[_n+1]
        drop dup_ein
    }

    *Larger
    forvalues pass = 1/5 {
        duplicates tag ein, gen(dup_ein)
        sort ein emp estabs
        drop if dup_ein>0 & emp<emp[_n+1] & ein==ein[_n+1]
        drop dup_ein
    }

    * Report what is still duplicated. Nothing is dropped here, so any EIN
    * with dup_ein_tag>0 stays duplicated in the file saved below, and the
    * later m:1 merge on ein needs ein to be unique in that file.
    duplicates tag ein, gen(dup_ein_tag)
    tab dup_ein_tag
    keep alpha ein
    duplicates drop

    * Add the EINs that already had a single alpha.
    append using "`mu_eins'"

    * Re-point mu_eins at a fresh temporary file holding the combined list.
    tempfile mu_eins
    save "`mu_eins'"
}

*Now go back to getting the mu and su vars in the same place
use "`su_eins'", clear

* Attach the MU alpha (if any) to each SU row by EIN. EINs present only in the
* MU list come in as new rows, since _merge==2 is not dropped.
merge m:1 ein using "`mu_eins'", update
* NOTE(review): alpha exists only in the using file (every SU variable except
* ein was renamed su_*), so master-only rows always have alpha=="" and this
* drop cannot remove anything -- confirm what was intended.
drop if _merge==1 & alpha!=""
drop _merge
* Number of rows sharing the EIN. duplicates tag counts the OTHER rows, hence
* the +1. It is computed before exact duplicate rows are removed below.
duplicates tag ein, gen(num_estabs_ein)
replace num_estabs_ein = num_estabs_ein + 1
compress
duplicates drop

* firmid is "0" + ein when no MU alpha was attached, otherwise alpha + "0000".
gen firmid = "0" + ein if alpha==""
replace firmid = alpha + "0000" if firmid==""

* One de-duplicated output file per year; the brace below closes the year loop.
save $datadir/BR/w2_ssl_`year'_usiris_2018q4_dedup, replace
}
